Provides metrics for the trained model. The final model in production is trained on the complete dataset.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score, recall_score, precision_score, brier_score_loss
#from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.pipeline import Pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from yellowbrick.classifier import ClassificationReport, ConfusionMatrix, DiscriminationThreshold, ClassPredictionError, PrecisionRecallCurve
from yellowbrick.model_selection import CVScores
import matplotlib.pyplot as plt
import seaborn as sns
# Pandas display settings for interactive inspection of wide/long frames.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

# Major-oxide geochemistry features used by the classifier; declared once so
# the training and validation splits cannot silently drift apart.
FEATURE_COLS = ['sio2', 'tio2', 'al2o3', 'feo_tot', 'mgo', 'cao', 'na2o', 'k2o', 'p2o5']
# Label encoding: igneous -> 0, sedimentary -> 1.
LABEL_MAP = {'igneous': 0, 'sedimentary': 1}

df_train = pd.read_csv(
    r'D:\Python ML\SA_Geology_protolith_predictions\data\external\classifier_data_2019-02-26_ilr_training.csv',
    encoding="ISO-8859-1", engine='python')  # training set
df_test = pd.read_csv(
    r'D:\Python ML\SA_Geology_protolith_predictions\data\external\classifier_data_2019-02-26_ilr_validation.csv',
    encoding="ISO-8859-1", engine='python')  # validation set

df_train.head(2)
df_train['rock_group'].value_counts()
df_test['rock_group'].value_counts()

# Split into features / labels. Labels are kept as a one-column DataFrame to
# match the downstream code that concatenates them with prediction columns.
X_train, y_train = df_train[FEATURE_COLS], df_train[['rock_group']].replace(LABEL_MAP)
X_test, y_test = df_test[FEATURE_COLS], df_test[['rock_group']].replace(LABEL_MAP)

print('Training data')
print(X_train.shape)
print(y_train.shape)
print('\n')
print('Validation data')
print(X_test.shape)
print(y_test.shape)
Model selection was first done by testing multiple different classifiers using the pycaret, sklearn and imblearn libraries. The final classifier was then tuned via a GridSearchCV to get the best hyperparameters. The model hyperparameters were then tweaked to reduce the final file size of the classifier from 2.5 GB down to ~100 MB. During this process the model scores (f1, precision and recall) and the confusion matrix were monitored to ensure minimal reduction in model performance.
def train_model(X: pd.DataFrame, y: pd.DataFrame) -> Pipeline:
    """
    Train the protolith (igneous vs sedimentary) classification model.

    Parameters:
        X: training data (major-oxide features), e.g. X_train
        y: training labels as a one-column DataFrame (0 = igneous,
           1 = sedimentary), e.g. y_train

    Returns:
        Fitted sklearn Pipeline: StandardScaler followed by a
        sigmoid-calibrated BalancedRandomForestClassifier.
    """
    # Stratified CV folds for probability calibration. shuffle=True is
    # required here: sklearn (>= 0.24) raises a ValueError when
    # random_state is set while shuffle is False.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
    # Pipeline: scale, then train a calibrated balanced random forest.
    # Hyperparameters were tuned via GridSearchCV and then trimmed
    # (n_estimators / max_depth) to shrink the serialized model size.
    clsf_pipe = Pipeline([
        ('sc', StandardScaler()),
        ('classifier', CalibratedClassifierCV(
            base_estimator=BalancedRandomForestClassifier(
                n_estimators=50, max_depth=15, min_samples_leaf=1,
                min_samples_split=2, max_features='sqrt',
                sampling_strategy='not minority', n_jobs=-1,
                random_state=101),
            method='sigmoid', cv=cv))])
    # Fit the full pipeline. Ravel the one-column DataFrame to a 1-D array
    # to avoid sklearn's "column-vector y" DataConversionWarning.
    model_pipe = clsf_pipe.fit(X, y.values.ravel())
    return model_pipe
# Train the production-candidate model and inspect the pipeline steps.
Model50_15_full = train_model(X_train, y_train)
Model50_15_full.steps

# Validation-set diagnostics (yellowbrick): each visualizer is fitted on the
# training split and scored/rendered against the held-out validation split.
vis = ClassificationReport(Model50_15_full, classes=['Igneous', 'Sedimentary'],
                           support=True, is_fitted=True,
                           # "Rport" typo in the chart title fixed.
                           title='Tuned Balanced Random Forest Classification Report')
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

cm = ConfusionMatrix(Model50_15_full, classes=['Igneous', 'Sedimentary'],
                     is_fitted=True, title='Tuned Balanced Random Forest CM')
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
cm.show()

pr = PrecisionRecallCurve(Model50_15_full, per_class=True,
                          classes=['Igneous', 'Sedimentary'], is_fitted=True)
pr.fit(X_train, y_train)
pr.score(X_test, y_test)
pr.show()
It's clear from this that the model predicts well for most rock types, except where there is significant overlap in compositions, such as in wacke, dolomite, shale and arkose. Iron-rich shales are only at about 50% accuracy. Igneous rocks look much better, but they are significantly more numerous in the dataset.
# Build a per-sample validation table: original rock names, true label,
# the model's hard prediction, and its calibrated class probabilities.
test_predictions = pd.DataFrame(Model50_15_full.predict(X_test))
test_probability = pd.DataFrame(Model50_15_full.predict_proba(X_test))
validation_predictions = pd.concat(
    [df_test[['rock_group', 'rock_type']], y_test, test_predictions, test_probability],
    axis=1,
).set_axis(
    ['rock_group', 'rock_type', 'true_label', 'predicted_label', 'proba_0', 'proba_1'],
    axis=1,
)

# Horizontal count plot: predictions per rock type, coloured by predicted class.
sns.catplot(hue='predicted_label', y='rock_type', data=validation_predictions,
            height=15, kind='count', palette='muted')
plt.xlim(0, 2000)
plt.title('Count plot of calibrated lithology predictions: class 0 = igneous, class 1 = sedimentary', fontsize=16)
plt.show()